import numpy as np
import pandas as pd
import math
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
from sklearn import tree
from sklearn.metrics import accuracy_score,r2_score, mean_absolute_error,mean_squared_error
# Toy recruitment dataset: three categorical attributes and the class label.
_rows = [
    ('Old',    'Yes', 'Soft Skill', 'Rejected'),
    ('Middle', 'No',  'Hard Skill', 'Selected'),
    ('Middle', 'Yes', 'Soft Skill', 'Rejected'),
    ('Young',  'No',  'Hard Skill', 'Selected'),
    ('Middle', 'Yes', 'Hard Skill', 'Rejected'),
    ('Young',  'No',  'Soft Skill', 'Selected'),
    ('Young',  'Yes', 'Soft Skill', 'Selected'),
    ('Old',    'No',  'Soft Skill', 'Rejected'),
    ('Old',    'No',  'Hard Skill', 'Rejected'),
    ('Middle', 'No',  'Soft Skill', 'Selected'),
]
data = pd.DataFrame(_rows, columns=['Age Group', 'Certified', 'Skill Type', 'Status'])
data
| | Age Group | Certified | Skill Type | Status |
|---|---|---|---|---|
| 0 | Old | Yes | Soft Skill | Rejected |
| 1 | Middle | No | Hard Skill | Selected |
| 2 | Middle | Yes | Soft Skill | Rejected |
| 3 | Young | No | Hard Skill | Selected |
| 4 | Middle | Yes | Hard Skill | Rejected |
| 5 | Young | No | Soft Skill | Selected |
| 6 | Young | Yes | Soft Skill | Selected |
| 7 | Old | No | Soft Skill | Rejected |
| 8 | Old | No | Hard Skill | Rejected |
| 9 | Middle | No | Soft Skill | Selected |
Design a function named `find_entropy` in Python for finding the entropy of the attributes given in the above dataset.
def find_entropy(col, indicator, df=None):
    """Compute an entropy (in bits) for attribute *col* of the dataset.

    Parameters
    ----------
    col : str
        Column name to evaluate.
    indicator : str
        "X" -> weighted conditional entropy of the class label ('Status')
        given *col* (the quantity subtracted from H(Status) to get the
        information gain); any other value -> plain entropy of the value
        distribution of *col* itself.
    df : pandas.DataFrame, optional
        Dataset to use. Defaults to the module-level ``data`` frame, which
        keeps the original two-argument call sites working unchanged.

    Returns
    -------
    float
        The requested entropy.
    """
    if df is None:
        df = data  # backward-compatible fallback to the notebook's global dataset
    n = len(df)
    if indicator == "X":
        # H(Status | col): weight each subset's entropy by its relative size.
        # Generalized from the original version, which hard-coded the two
        # class labels 'Selected'/'Rejected'; value_counts handles any number
        # of classes, and a pure subset contributes 0 (p == 1 => log2(p) == 0).
        tot = 0.0
        for _, subset in df.groupby(col):
            size = len(subset)
            weight = size / n
            for count in subset['Status'].value_counts():
                p = count / size
                tot -= weight * p * math.log2(p)
        return tot
    # Plain entropy of the column's own distribution. Generalized from the
    # original code, which assumed exactly two unique values (values[0] and
    # values[1]) and would silently ignore any further categories.
    tot = 0.0
    for count in df[col].value_counts():
        p = count / n
        tot -= p * math.log2(p)
    return tot
Design a function named `find_gain` in Python for finding the information gain of the attributes given in the above dataset w.r.t. the 'Status' attribute.
# Encode every categorical column to integers, split, and fit an
# entropy-based decision tree on the toy recruitment dataset.
dtc = DecisionTreeClassifier(criterion="entropy")
le = LabelEncoder()
for col_idx in range(4):
    data.iloc[:, col_idx] = le.fit_transform(data.iloc[:, col_idx])
x = data.iloc[:, :-1]
y = data.iloc[:, -1]
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=5, test_size=0.2)
dtc.fit(x_train, y_train)
y_pred = dtc.predict(x_test)
Entropy of Certified is: 0.8754887502163469 Entropy of Age Group is: 0.4 Entropy of Skill Type is: 1.0 Entropy of Status is: 1.0
# Report the information gain of each candidate attribute w.r.t. 'Status'.
for attribute in ('Certified', 'Age Group', 'Skill Type'):
    print(f"Info. Gain of {attribute} is: ", find_gain(attribute))
Info. Gain of Certified is: 0.12451124978365313 Info. Gain of Age Group is: 0.6 Info. Gain of Skill Type is: 0.0
# Re-run the full pipeline (encode, split, train) and score the tree on the
# held-out 20% split. Note: fit_transform on already-encoded integer columns
# is a no-op mapping, so re-running this cell is harmless.
dtc = DecisionTreeClassifier(criterion="entropy")
le = LabelEncoder()
for idx in range(4):
    data.iloc[:, idx] = le.fit_transform(data.iloc[:, idx])
x = data.iloc[:, :-1]
y = data.iloc[:, -1]
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=5, test_size=0.2)
dtc.fit(x_train, y_train)
y_pred = dtc.predict(x_test)
print("Accuracy score of Decision Tree Classfier is: ", accuracy_score(y_true=y_test, y_pred=y_pred))
Accuracy score of Decision Tree Classfier is: 1.0
# Visualize the fitted decision tree; filled=True colors nodes by majority class.
tree.plot_tree(dtc,filled=True)
plt.show()
# Load and preprocess the Titanic training set.
titanic_data = pd.read_csv("train.csv")
# Impute missing ages with the column median.
titanic_data['Age'] = titanic_data['Age'].fillna(titanic_data['Age'].median())
# Remove identifier-like / high-cardinality columns in one pass.
titanic_data.drop(['Name', 'Ticket', 'PassengerId', 'Cabin'], axis=1, inplace=True)
# Missing embarkation ports get the placeholder category 'A'.
titanic_data['Embarked'] = titanic_data['Embarked'].fillna('A')
# After the drops, column 2 is 'Sex' and column 7 is 'Embarked';
# encode both to integers with the LabelEncoder from the earlier cell.
titanic_data.iloc[:, 2] = le.fit_transform(titanic_data.iloc[:, 2])
titanic_data.iloc[:, 7] = le.fit_transform(titanic_data.iloc[:, 7])
titanic_data.head()
| | Survived | Pclass | Sex | Age | SibSp | Parch | Fare | Embarked |
|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 3 | 1 | 22.0 | 1 | 0 | 7.2500 | 3 |
| 1 | 1 | 1 | 0 | 38.0 | 1 | 0 | 71.2833 | 1 |
| 2 | 1 | 3 | 0 | 26.0 | 0 | 0 | 7.9250 | 3 |
| 3 | 1 | 1 | 0 | 35.0 | 1 | 0 | 53.1000 | 3 |
| 4 | 0 | 3 | 1 | 35.0 | 0 | 0 | 8.0500 | 3 |
# Target selection fix: after preprocessing the column order is
# [Survived, Pclass, Sex, Age, SibSp, Parch, Fare, Embarked], so the
# original x = iloc[:, :-1] / y = iloc[:, -1] silently trained a
# classifier for 'Embarked' (the LAST column) instead of 'Survived'.
# 'Survived' is column 0; use it as the label and everything else as features.
x = titanic_data.iloc[:, 1:]
y = titanic_data.iloc[:, 0]
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=150)
dtc = DecisionTreeClassifier(criterion="entropy")
dtc.fit(x_train, y_train)
DecisionTreeClassifier(criterion='entropy')
# Plot the fitted Titanic tree, then score predictions on the test split.
tree.plot_tree(dtc, filled=True)
plt.show()
y_pred = dtc.predict(x_test)
print("Accuracy score of Decision Tree Classfier is: ", accuracy_score(y_true=y_test, y_pred=y_pred))
# Regression-style metrics on the 0/1 predictions, reported alongside accuracy.
for label, metric in (("R2 Score is: ", r2_score),
                      ("MAE is: ", mean_absolute_error),
                      ("MSE is: ", mean_squared_error)):
    print(label, metric(y_test, y_pred))
Accuracy score of Decision Tree Classfier is: 0.8156424581005587 R2 Score is: 0.1776416539050535 MAE is: 0.3016759776536313 MSE is: 0.5363128491620112